# Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# ============================================================================
# Load data
# One row per MRI session; Group is the diagnosis label.
# NOTE(review): relative path containing a space — confirm 'project data.csv'
# is in the working directory when the script runs.
Alzheimer <- read.csv('project data.csv')
# Quick peek; note the NA values in SES, which are handled below.
head(Alzheimer)
## Group M.F Age EDUC SES MMSE CDR eTIV nWBV ASF
## 1 Nondemented M 87 14 2 27 0.0 1987 0.696 0.883
## 2 Nondemented M 88 14 2 30 0.0 2004 0.681 0.876
## 3 Demented M 75 12 NA 23 0.5 1678 0.736 1.046
## 4 Demented M 76 12 NA 28 0.5 1738 0.713 1.010
## 5 Demented M 80 12 NA 22 0.5 1698 0.701 1.034
## 6 Nondemented F 88 18 3 28 0.0 1215 0.710 1.444
# Structure check: 373 observations, Group and M.F read in as character.
str(Alzheimer)
## 'data.frame': 373 obs. of 10 variables:
## $ Group: chr "Nondemented" "Nondemented" "Demented" "Demented" ...
## $ M.F : chr "M" "M" "M" "M" ...
## $ Age : int 87 88 75 76 80 88 90 80 83 85 ...
## $ EDUC : int 14 14 12 12 12 18 18 12 12 12 ...
## $ SES : int 2 2 NA NA NA 3 3 4 4 4 ...
## $ MMSE : int 27 30 23 28 22 28 27 28 29 30 ...
## $ CDR : num 0 0 0.5 0.5 0.5 0 0 0 0.5 0 ...
## $ eTIV : int 1987 2004 1678 1738 1698 1215 1200 1689 1701 1699 ...
## $ nWBV : num 0.696 0.681 0.736 0.713 0.701 0.71 0.718 0.712 0.711 0.705 ...
## $ ASF : num 0.883 0.876 1.046 1.01 1.034 ...
# Preliminary Analysis
# Convert M/F into numeric values
# Encode gender as 1 = Male, 0 = Female; any unexpected value becomes NA.
Alzheimer$M.F <- ifelse(Alzheimer$M.F == 'M', 1,
ifelse(Alzheimer$M.F == 'F', 0, NA))
# Confirm the conversion
head(Alzheimer)
## Group M.F Age EDUC SES MMSE CDR eTIV nWBV ASF
## 1 Nondemented 1 87 14 2 27 0.0 1987 0.696 0.883
## 2 Nondemented 1 88 14 2 30 0.0 2004 0.681 0.876
## 3 Demented 1 75 12 NA 23 0.5 1678 0.736 1.046
## 4 Demented 1 76 12 NA 28 0.5 1738 0.713 1.010
## 5 Demented 1 80 12 NA 22 0.5 1698 0.701 1.034
## 6 Nondemented 0 88 18 3 28 0.0 1215 0.710 1.444
# Remove rows with Group = 'Converted'
# 'Converted' subjects (presumably those whose diagnosis changed mid-study)
# are dropped so Group becomes binary.
# NOTE(review): `filter` is masked by plotly at attach time (see startup
# messages); it still performs the dplyr row filter here — later output
# shows original row names (1, 2, 6, 7, ...) are retained.
Alzheimer <- Alzheimer %>%
filter(Group != 'Converted')
# Remove missing values
# Drops rows with any NA (mostly missing SES); 373 -> 317 observations,
# confirmed by the summary below (Length:317).
Alzheimer <- na.omit(Alzheimer)
# Analysis
# Generate summary of Alzheimer
# Five-number summaries after cleaning; note the very different scales
# (eTIV in the thousands vs nWBV/ASF near 1), which matters for any
# distance-based method used later.
summary(Alzheimer)
## Group M.F Age EDUC
## Length:317 Min. :0.0000 Min. :60.00 Min. : 6.00
## Class :character 1st Qu.:0.0000 1st Qu.:71.00 1st Qu.:12.00
## Mode :character Median :0.0000 Median :76.00 Median :15.00
## Mean :0.4322 Mean :76.72 Mean :14.62
## 3rd Qu.:1.0000 3rd Qu.:82.00 3rd Qu.:16.00
## Max. :1.0000 Max. :98.00 Max. :23.00
## SES MMSE CDR eTIV
## Min. :1.000 Min. : 4.00 Min. :0.0000 Min. :1106
## 1st Qu.:2.000 1st Qu.:27.00 1st Qu.:0.0000 1st Qu.:1358
## Median :2.000 Median :29.00 Median :0.0000 Median :1476
## Mean :2.546 Mean :27.26 Mean :0.2729 Mean :1494
## 3rd Qu.:3.000 3rd Qu.:30.00 3rd Qu.:0.5000 3rd Qu.:1599
## Max. :5.000 Max. :30.00 Max. :2.0000 Max. :2004
## nWBV ASF
## Min. :0.6440 Min. :0.876
## 1st Qu.:0.7000 1st Qu.:1.098
## Median :0.7320 Median :1.189
## Mean :0.7306 Mean :1.192
## 3rd Qu.:0.7570 3rd Qu.:1.293
## Max. :0.8370 Max. :1.587
# ========================================================================
# Select all numeric variables
# NOTE(review): attach() is discouraged — it places a snapshot of the
# columns on the search path, which can silently mask other objects and
# goes stale when the data frame changes. Kept because later code refers
# to the attached columns by bare name.
attach(Alzheimer)
numeric_vars <- c('Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF')
numeric_vars
## [1] "Age" "EDUC" "SES" "MMSE" "CDR" "eTIV" "nWBV" "ASF"
# Find standard deviation of variables
# Column-wise standard deviations; eTIV's SD (~180) dwarfs the rest, so
# unstandardized Euclidean distances computed later are dominated by eTIV.
sds <- apply(Alzheimer[, numeric_vars], 2, sd)
print(sds)
## Age EDUC SES MMSE CDR eTIV
## 7.80507137 2.92687640 1.12309861 3.86122732 0.38214372 179.71907893
## nWBV ASF
## 0.03810197 0.13966275
# Create appropriate plots
# Boxplot of Age by diagnosis group, split by gender (M.F is still 0/1
# here, hence as.factor() and the manual legend labels).
ggplot(Alzheimer,
aes(x = Group, y = Age, fill = as.factor(M.F))) +
geom_boxplot() +
labs(x = 'Group', y = 'Age',
title = paste('Boxplot of Demented and Nondemented based on Age and Gender'),
fill = 'Gender (M.F)') +
scale_fill_manual(
values = c("0" = "tomato1", "1" = "lightseagreen" ),
labels = c("0" = "Female", "1" = "Male"))

# Convert M.F to a labelled factor for plotting.
# Fix 1: reference the column explicitly rather than the bare `M.F` left on
# the search path by attach() — the attached copy is a snapshot and relying
# on it is fragile; the value is identical at this point.
Alzheimer$M.F <- as.factor(ifelse(Alzheimer$M.F == 1, 'Male', 'Female'))
# Dodged bar chart of gender by diagnosis group, labelled with counts.
gender_G <- ggplot(Alzheimer,
aes(x = M.F,
fill = Group)) +
geom_bar(position = 'dodge', color = 'black') +
# Fix 2: the `..count..` notation was deprecated in ggplot2 3.4.0 (see the
# recorded warning below); after_stat(count) is the supported replacement
# and produces identical labels.
geom_text(aes(label = after_stat(count)), stat = 'count', vjust = 0.5, colour = 'black') +
labs(x = 'Gender', y = 'Frequency',
title = paste('Barchart of Gender by Demented vs Nondemented'))
# Render interactively. The warning below was emitted by the original
# `..count..` usage and will no longer appear after Fix 2.
ggplotly(gender_G)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## ℹ The deprecated feature was likely used in the ggplot2 package.
## Please report the issue at <https://github.com/tidyverse/ggplot2/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# ========================================================================
# Convert Group variable to numeric values
# Demented -> 1, Nondemented -> 0 ('Converted' rows were removed earlier,
# so no NA is actually produced).
Alzheimer$Group <- ifelse(Alzheimer$Group == 'Demented', 1,
ifelse(Alzheimer$Group == 'Nondemented', 0, NA))
# Convert M/F into numeric values
# M.F is a factor at this point; comparing a factor to a string compares
# labels element-wise, so this round-trips the column back to 1/0.
Alzheimer$M.F <- ifelse(Alzheimer$M.F == 'Male', 1,
ifelse(Alzheimer$M.F == 'Female', 0, NA))
head(Alzheimer)
## Group M.F Age EDUC SES MMSE CDR eTIV nWBV ASF
## 1 0 1 87 14 2 27 0.0 1987 0.696 0.883
## 2 0 1 88 14 2 30 0.0 2004 0.681 0.876
## 6 0 0 88 18 3 28 0.0 1215 0.710 1.444
## 7 0 0 90 18 3 27 0.0 1200 0.718 1.462
## 8 0 1 80 12 4 28 0.0 1689 0.712 1.039
## 9 0 1 83 12 4 29 0.5 1701 0.711 1.032
# Similarity measure
# Euclidean distances over ALL columns (including the Group label), on the
# raw scale. NOTE(review): without standardization eTIV (SD ~180) dominates
# this distance; consider stand = TRUE as in the Pearson version below.
distance.Euclidean <- get_dist(Alzheimer)
fviz_dist(distance.Euclidean,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# Correlation-based distance; stand = TRUE standardizes columns first.
distance.corr <- get_dist(Alzheimer, stand = TRUE, method = "pearson")
fviz_dist(distance.corr,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# Standardize features
# Z-score every column (mean 0, SD 1) so no variable dominates the
# Euclidean distances used by k-means.
# NOTE(review): this scales ALL columns, including the Group label —
# clustering on the outcome makes clusters partially label-driven; consider
# scaling only the measurement columns. TODO confirm intent.
scaled_A_vars <- scale(Alzheimer)
# Determining the optimal number of clusters
# Elbow (within-cluster sum of squares) plot; the dashed line marks the
# chosen elbow at k = 3.
fviz_nbclust(scaled_A_vars, kmeans, method = "wss")+
geom_vline(xintercept = 3, linetype = 2)

# K-Means Clustering
# Fixed seed makes the nstart = 20 random initializations reproducible;
# the three kmeans() calls consume the RNG stream in this exact order.
set.seed(123)
kmeans2 <- kmeans(scaled_A_vars, centers = 2, nstart = 20)
kmeans3 <- kmeans(scaled_A_vars, centers = 3, nstart = 20)
kmeans4 <- kmeans(scaled_A_vars, centers = 4, nstart = 20)
# Inspect the k = 3 solution (the elbow choice).
kmeans3
## K-means clustering with 3 clusters of sizes 102, 138, 77
##
## Cluster means:
## Group M.F Age EDUC SES MMSE
## 1 1.2212058 0.2552531 -0.01010028 -0.5384328 0.430657146 -0.9691794
## 2 -0.7276933 -0.6811764 -0.17716110 -0.0244847 0.004437111 0.5027078
## 3 -0.3135235 0.8826822 0.33088909 0.7571303 -0.578433119 0.3828912
## CDR eTIV nWBV ASF
## 1 1.1331121 -0.2233269 -0.5233154 0.1817558
## 2 -0.6571651 -0.5477012 0.5301795 0.5347985
## 3 -0.3232292 1.2774301 -0.2569688 -1.1992374
##
## Clustering vector:
## 1 2 6 7 8 9 10 14 15 16 17 18 19 20 21 22 23 24 25 26
## 3 3 2 2 3 3 3 2 2 1 1 2 1 2 2 2 2 2 2 1
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## 1 1 1 3 3 3 3 3 3 2 2 1 1 1 1 2 2 2 2 1
## 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
## 1 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 1 1 1
## 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
## 1 2 1 1 1 1 2 2 1 1 1 1 1 2 2 1 1 2 2 1
## 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## 1 1 1 1 2 2 2 1 1 3 3 3 3 3 2 2 2 2 2 2
## 107 108 109 110 111 112 113 114 115 116 117 118 119 120 123 124 125 126 127 128
## 2 2 2 1 1 1 1 1 3 3 3 3 3 3 1 1 1 3 3 2
## 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
## 2 2 2 2 2 2 2 3 3 3 3 3 1 1 2 2 2 2 2 1
## 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
## 1 2 2 2 3 3 3 3 3 1 1 1 1 1 1 1 1 2 2 2
## 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
## 2 1 1 1 1 1 1 3 3 3 2 2 2 2 3 3 3 2 2 3
## 189 190 191 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
## 3 1 1 2 2 2 2 2 2 1 1 1 1 1 3 3 1 1 3 3
## 211 212 213 214 215 216 217 218 221 222 223 224 225 226 227 228 229 230 231 232
## 3 3 1 1 1 1 1 1 1 1 3 3 3 3 2 2 2 1 1 2
## 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 2 2 2 1 1 2 2 2 2 2 2 2 2 1 1 3 3 3 3 2
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
## 2 2 2 1 1 1 2 2 2 2 3 3 3 1 1 2 2 2 2 2
## 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 291 292 293 294
## 2 1 1 2 2 2 2 3 2 2 1 1 2 2 2 2 3 3 3 1
## 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
## 1 3 1 1 1 2 2 3 3 3 3 3 2 2 2 3 3 3 2 2
## 315 316 317 318 319 325 326 327 328 329 330 331 332 333 334 335 336
## 2 2 2 3 3 2 2 2 2 1 1 3 3 3 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 792.3855 693.8160 441.0180
## (between_SS / total_SS = 39.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# To visualise the results the fviz_cluster function can be used:
# stand = FALSE because the data were already standardized above.
fviz_cluster(kmeans2, data = scaled_A_vars, stand = FALSE)

fviz_cluster(kmeans3, data = scaled_A_vars, stand = FALSE)

fviz_cluster(kmeans4, data = scaled_A_vars, stand = FALSE)

# Side-by-side comparison of the k = 2/3/4 solutions.
f1 <- fviz_cluster(kmeans2,
geom = "point", data = scaled_A_vars) + ggtitle("k = 2")
f2 <- fviz_cluster(kmeans3,
geom = "point", data = scaled_A_vars) + ggtitle("k = 3")
f3 <- fviz_cluster(kmeans4,
geom = "point", data = scaled_A_vars) + ggtitle("k = 4")
grid.arrange(f1, f2, f3, nrow = 2)

# ========================================================================
# Implement feature selection on the data set
# Second attach(); it shadows the copies attached earlier (hence the
# masking message). NOTE(review): the later lm() calls that use bare
# variable names depend on this — prefer explicit `data =` arguments.
attach(Alzheimer)
## The following objects are masked from Alzheimer (pos = 3):
##
## Age, ASF, CDR, EDUC, eTIV, Group, M.F, MMSE, nWBV, SES
# Response = numeric Group (column 1); predictors = the other 9 columns.
y_Group <- as.numeric(Alzheimer[,1])
X <- Alzheimer[,2:10]
# NOTE(review): glm() with the default gaussian family fits a LINEAR
# probability model to the 0/1 outcome, not logistic regression; kept
# as-is because the step() results below are built on this fit. Confirm
# whether family = binomial was intended.
model1 <- glm(y_Group~.,data=X)
summary(model1)
##
## Call:
## glm(formula = y_Group ~ ., data = X)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.283e-01 1.560e+00 0.339 0.7351
## M.F 1.602e-01 3.447e-02 4.649 4.95e-06 ***
## Age -3.523e-03 2.135e-03 -1.650 0.0999 .
## EDUC -1.086e-02 6.920e-03 -1.570 0.1175
## SES 1.039e-02 1.800e-02 0.577 0.5642
## MMSE 5.481e-03 5.274e-03 1.039 0.2995
## CDR 1.056e+00 5.234e-02 20.169 < 2e-16 ***
## eTIV -2.187e-05 5.144e-04 -0.043 0.9661
## nWBV -9.447e-01 4.803e-01 -1.967 0.0501 .
## ASF 4.122e-01 6.520e-01 0.632 0.5277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.05730189)
##
## Null deviance: 76.120 on 316 degrees of freedom
## Residual deviance: 17.592 on 307 degrees of freedom
## AIC: 5.0092
##
## Number of Fisher Scoring iterations: 2
# Backward elimination by AIC.
# Fix: step() has no `method` argument — the original `method="backward"`
# was silently swallowed by `...` and ignored. It only worked because
# `direction` defaults to "backward" when no `scope` is supplied.
# `direction = "backward"` states the intent explicitly; output identical.
step1 <- step(model1, direction = "backward")
## Start: AIC=5.01
## y_Group ~ M.F + Age + EDUC + SES + MMSE + CDR + eTIV + nWBV +
## ASF
##
## Df Deviance AIC
## - eTIV 1 17.592 3.011
## - SES 1 17.611 3.353
## - ASF 1 17.615 3.422
## - MMSE 1 17.654 4.122
## <none> 17.592 5.009
## - EDUC 1 17.733 5.544
## - Age 1 17.748 5.808
## - nWBV 1 17.813 6.980
## - M.F 1 18.830 24.578
## - CDR 1 40.901 270.474
##
## Step: AIC=3.01
## y_Group ~ M.F + Age + EDUC + SES + MMSE + CDR + nWBV + ASF
##
## Df Deviance AIC
## - SES 1 17.611 1.354
## - MMSE 1 17.654 2.124
## <none> 17.592 3.011
## - EDUC 1 17.736 3.595
## - Age 1 17.751 3.866
## - nWBV 1 17.815 5.000
## - ASF 1 18.326 13.973
## - M.F 1 18.866 23.185
## - CDR 1 40.946 268.816
##
## Step: AIC=1.35
## y_Group ~ M.F + Age + EDUC + MMSE + CDR + nWBV + ASF
##
## Df Deviance AIC
## - MMSE 1 17.670 0.414
## <none> 17.611 1.354
## - Age 1 17.770 2.212
## - nWBV 1 17.827 3.223
## - EDUC 1 18.058 7.311
## - ASF 1 18.410 13.431
## - M.F 1 18.938 22.381
## - CDR 1 40.951 266.861
##
## Step: AIC=0.41
## y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF
##
## Df Deviance AIC
## <none> 17.670 0.41
## - Age 1 17.802 0.78
## - nWBV 1 17.845 1.55
## - EDUC 1 18.084 5.76
## - ASF 1 18.460 12.27
## - M.F 1 18.995 21.34
## - CDR 1 55.051 358.65
# Final backward model: CDR dominates, with M.F and ASF also significant.
summary(step1)
##
## Call:
## glm(formula = y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF,
## data = X)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.546490 0.484185 1.129 0.259906
## M.F 0.162088 0.033617 4.822 2.24e-06 ***
## Age -0.003178 0.002085 -1.524 0.128482
## EDUC -0.013056 0.004843 -2.696 0.007405 **
## CDR 1.020255 0.039840 25.609 < 2e-16 ***
## nWBV -0.813532 0.463650 -1.755 0.080311 .
## ASF 0.448722 0.120551 3.722 0.000234 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.05699945)
##
## Null deviance: 76.12 on 316 degrees of freedom
## Residual deviance: 17.67 on 310 degrees of freedom
## AIC: 0.41434
##
## Number of Fisher Scoring iterations: 2
# y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF (Features selected)
# Start from the intercept-only model for forward selection.
model2 <- lm(y_Group~1,data=X)
# Forward selection by AIC.
# Fix: step() has no `method` argument, so the original `method="forward"`
# was ignored and `direction` defaulted to "both" — visible in the recorded
# tables below, which contain "-" (drop) candidate rows alongside "+" (add)
# rows. With `direction = "forward"` only additions are considered; the
# selected model is unchanged here because no term was ever dropped, but
# the "-" rows will no longer appear in the trace.
step2 <- step(model2,
scope=~ M.F + Age + EDUC + SES + MMSE + CDR+ eTIV + nWBV + ASF,
direction="forward")
## Start: AIC=-450.23
## y_Group ~ 1
##
## Df Sum of Sq RSS AIC
## + CDR 1 56.023 20.097 -870.39
## + MMSE 1 29.571 46.549 -604.13
## + nWBV 1 8.354 67.765 -485.08
## + M.F 1 5.730 70.389 -473.04
## + EDUC 1 3.703 72.417 -464.04
## + SES 1 2.065 74.055 -456.95
## <none> 76.120 -450.23
## + Age 1 0.219 75.901 -449.14
## + eTIV 1 0.013 76.107 -448.28
## + ASF 1 0.002 76.118 -448.24
##
## Step: AIC=-870.39
## y_Group ~ CDR
##
## Df Sum of Sq RSS AIC
## + M.F 1 0.700 19.398 -879.62
## + EDUC 1 0.668 19.429 -879.10
## + SES 1 0.623 19.474 -878.37
## + ASF 1 0.189 19.908 -871.38
## + eTIV 1 0.176 19.921 -871.17
## <none> 20.097 -870.39
## + nWBV 1 0.062 20.036 -869.36
## + Age 1 0.049 20.048 -869.17
## + MMSE 1 0.000 20.097 -868.39
## - CDR 1 56.023 76.120 -450.23
##
## Step: AIC=-879.62
## y_Group ~ CDR + M.F
##
## Df Sum of Sq RSS AIC
## + eTIV 1 1.157 18.240 -897.12
## + ASF 1 1.144 18.253 -896.89
## + EDUC 1 0.778 18.619 -890.60
## + SES 1 0.687 18.710 -889.06
## <none> 19.398 -879.62
## + Age 1 0.033 19.364 -878.16
## + nWBV 1 0.015 19.383 -877.86
## + MMSE 1 0.001 19.397 -877.63
## - M.F 1 0.700 20.097 -870.39
## - CDR 1 50.992 70.389 -473.04
##
## Step: AIC=-897.12
## y_Group ~ CDR + M.F + eTIV
##
## Df Sum of Sq RSS AIC
## + EDUC 1 0.359 17.881 -901.43
## + SES 1 0.257 17.983 -899.62
## <none> 18.240 -897.12
## + nWBV 1 0.071 18.170 -896.35
## + Age 1 0.009 18.231 -895.28
## + MMSE 1 0.004 18.236 -895.20
## + ASF 1 0.004 18.236 -895.19
## - eTIV 1 1.157 19.398 -879.62
## - M.F 1 1.681 19.921 -871.17
## - CDR 1 49.100 67.340 -485.08
##
## Step: AIC=-901.43
## y_Group ~ CDR + M.F + eTIV + EDUC
##
## Df Sum of Sq RSS AIC
## <none> 17.881 -901.43
## + nWBV 1 0.068 17.813 -900.63
## + Age 1 0.020 17.861 -899.78
## + MMSE 1 0.017 17.864 -899.74
## + SES 1 0.014 17.867 -899.68
## + ASF 1 0.013 17.868 -899.66
## - EDUC 1 0.359 18.240 -897.12
## - eTIV 1 0.738 18.619 -890.60
## - M.F 1 1.508 19.389 -877.76
## - CDR 1 46.972 64.853 -495.01
# Final forward model: CDR + M.F + eTIV + EDUC.
summary(step2)
##
## Call:
## lm(formula = y_Group ~ CDR + M.F + eTIV + EDUC, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1731 -0.1229 -0.0542 0.2064 0.4758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.257e-01 1.340e-01 5.417 1.21e-07 ***
## CDR 1.047e+00 3.657e-02 28.629 < 2e-16 ***
## M.F 1.732e-01 3.375e-02 5.130 5.10e-07 ***
## eTIV -3.397e-04 9.464e-05 -3.589 0.000385 ***
## EDUC -1.219e-02 4.870e-03 -2.504 0.012793 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2394 on 312 degrees of freedom
## Multiple R-squared: 0.7651, Adjusted R-squared: 0.7621
## F-statistic: 254 on 4 and 312 DF, p-value: < 2.2e-16
# Refit the backward-selected model.
# Fix: the original call relied on attach()ed copies of the columns being
# on the search path; passing `data = Alzheimer` resolves the predictors
# explicitly (y_Group is still found in the calling environment). The
# fitted values are identical; only the `Call:` line echoed by summary()
# gains the `data` argument relative to the recorded output below.
b_model <- lm(y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF, data = Alzheimer)
summary(b_model)
##
## Call:
## lm(formula = y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14908 -0.12500 -0.06085 0.19494 0.47964
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.546490 0.484185 1.129 0.259906
## M.F 0.162088 0.033617 4.822 2.24e-06 ***
## Age -0.003178 0.002085 -1.524 0.128482
## EDUC -0.013056 0.004843 -2.696 0.007405 **
## CDR 1.020255 0.039840 25.609 < 2e-16 ***
## nWBV -0.813532 0.463650 -1.755 0.080311 .
## ASF 0.448722 0.120551 3.722 0.000234 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2387 on 310 degrees of freedom
## Multiple R-squared: 0.7679, Adjusted R-squared: 0.7634
## F-statistic: 170.9 on 6 and 310 DF, p-value: < 2.2e-16
# Refit the forward-selected model, likewise with explicit data.
f_model <- lm(y_Group ~ CDR + M.F + eTIV + EDUC, data = Alzheimer)
summary(f_model)
##
## Call:
## lm(formula = y_Group ~ CDR + M.F + eTIV + EDUC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1731 -0.1229 -0.0542 0.2064 0.4758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.257e-01 1.340e-01 5.417 1.21e-07 ***
## CDR 1.047e+00 3.657e-02 28.629 < 2e-16 ***
## M.F 1.732e-01 3.375e-02 5.130 5.10e-07 ***
## eTIV -3.397e-04 9.464e-05 -3.589 0.000385 ***
## EDUC -1.219e-02 4.870e-03 -2.504 0.012793 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2394 on 312 degrees of freedom
## Multiple R-squared: 0.7651, Adjusted R-squared: 0.7621
## F-statistic: 254 on 4 and 312 DF, p-value: < 2.2e-16
# Compare the two selected models.
# NOTE(review): these models are not strictly nested (f_model contains eTIV,
# which b_model lacks), so the F-test below is only approximate. As recorded,
# p = 0.1587 — no significant difference, favouring the smaller model.
anova(b_model, f_model)
## Analysis of Variance Table
##
## Model 1: y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF
## Model 2: y_Group ~ CDR + M.F + eTIV + EDUC
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 310 17.670
## 2 312 17.881 -2 -0.21112 1.852 0.1587
# ========================================================================
# Convert Group variable to factor
# caret::train() requires a factor outcome for classification; levels
# become "0" (Nondemented) and "1" (Demented).
Alzheimer$Group <- as.factor(Alzheimer$Group)
# Cross Validation (CV)
# For 5-fold CV
trControl <- trainControl(method = "cv", number = 5)
#lda
# Linear discriminant analysis on the forward-selected features.
lda.fit <- train(Group ~ CDR + M.F + eTIV + EDUC,
method = "lda",
trControl = trControl,
metric = "Accuracy",
data = Alzheimer)
# NOTE(review): predictions below are on the TRAINING data, so this
# confusion matrix reports resubstitution (optimistic) accuracy; the
# honest cross-validated estimate lives in lda.fit$results.
lda.pred <- predict(lda.fit,Alzheimer)
t1 <- table(lda.pred, Alzheimer$Group)
confusionMatrix(t1)
## Confusion Matrix and Statistics
##
##
## lda.pred 0 1
## 0 188 0
## 1 2 127
##
## Accuracy : 0.9937
## 95% CI : (0.9774, 0.9992)
## No Information Rate : 0.5994
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9869
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9895
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9845
## Prevalence : 0.5994
## Detection Rate : 0.5931
## Detection Prevalence : 0.5931
## Balanced Accuracy : 0.9947
##
## 'Positive' Class : 0
##
# ========================================================================
#glm
# Logistic regression (caret's "glm" method uses family = binomial for a
# two-level factor outcome).
glm.fit <- train(Group ~ CDR + M.F + eTIV + EDUC,
method = "glm",
trControl = trControl,
metric = "Accuracy",
data = Alzheimer)
# The warnings below indicate (quasi-)separation: CDR almost perfectly
# separates the two groups, so the logistic coefficients are unstable even
# though the class predictions themselves remain usable.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): as with lda, this is resubstitution accuracy on the
# training data; see glm.fit$results for the cross-validated estimate.
glm.pred <- predict(glm.fit,Alzheimer)
t2 <- table(glm.pred, Alzheimer$Group)
confusionMatrix(t2)
## Confusion Matrix and Statistics
##
##
## glm.pred 0 1
## 0 188 0
## 1 2 127
##
## Accuracy : 0.9937
## 95% CI : (0.9774, 0.9992)
## No Information Rate : 0.5994
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9869
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9895
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9845
## Prevalence : 0.5994
## Detection Rate : 0.5931
## Detection Prevalence : 0.5931
## Balanced Accuracy : 0.9947
##
## 'Positive' Class : 0
##
# ========================================================================
#knn
# k-nearest neighbours, tuning k over 1..10 by 5-fold CV accuracy.
knn.fit <- train(Group ~ CDR + M.F + eTIV + EDUC,
method = "knn",
tuneGrid = expand.grid(k = 1:10),
trControl = trControl,
metric = "Accuracy",
data = Alzheimer)
# NOTE(review): the perfect (100%) accuracy below is resubstitution on the
# training data — kNN with a small k trivially memorizes its training set.
# Use knn.fit$results / knn.fit$bestTune for the cross-validated picture
# before concluding kNN beats lda/glm.
knn.pred <- predict(knn.fit,Alzheimer)
t4 <- table(knn.pred, Alzheimer$Group)
confusionMatrix(t4)
## Confusion Matrix and Statistics
##
##
## knn.pred 0 1
## 0 190 0
## 1 0 127
##
## Accuracy : 1
## 95% CI : (0.9884, 1)
## No Information Rate : 0.5994
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5994
## Detection Rate : 0.5994
## Detection Prevalence : 0.5994
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##